import kagglehub
from kagglehub import KaggleDatasetAdapter
import kagglehub

# Download latest version
path = kagglehub.dataset_download("matthieugimbert/french-bakery-daily-sales")

print("Path to dataset files:", path)
/Users/rajivsambasivan/Programming/r2ds-blog-worskpace/r2ds-blog/.venv/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Path to dataset files: /Users/rajivsambasivan/.cache/kagglehub/datasets/matthieugimbert/french-bakery-daily-sales/versions/1
import pandas as pd
# kagglehub, please make this simpler, provide some example, really poor documentation for the api"
fp = path + "/Bakery Sales.csv"
df = pd.read_csv(fp)
cols = df.columns.tolist()
cols.remove("Unnamed: 0")
df = df[cols]
df.head()
date time ticket_number article Quantity unit_price
0 2021-01-02 08:38 150040.0 BAGUETTE 1.0 0,90 €
1 2021-01-02 08:38 150040.0 PAIN AU CHOCOLAT 3.0 1,20 €
2 2021-01-02 09:14 150041.0 PAIN AU CHOCOLAT 2.0 1,20 €
3 2021-01-02 09:14 150041.0 PAIN 1.0 1,15 €
4 2021-01-02 09:25 150042.0 TRADITIONAL BAGUETTE 5.0 1,20 €
filter_baguette = (df.article == "BAGUETTE")
df["datetime"] = pd.to_datetime(df["date"]  + " " + df["time"])
df_baguette = df[filter_baguette]
cols = ["datetime", "Quantity"]
df_baguette = df_baguette[cols]
df_daily_baugette = df_baguette.set_index("datetime").resample("D").sum()
df_weekly_baugette = df_baguette.set_index("datetime").resample("W").sum()
df_weekly_baugette = df_weekly_baugette.reset_index()
df_weekly_baugette
datetime Quantity
0 2021-01-03 81.0
1 2021-01-10 165.0
2 2021-01-17 168.0
3 2021-01-24 160.0
4 2021-01-31 167.0
... ... ...
87 2022-09-04 205.0
88 2022-09-11 175.0
89 2022-09-18 207.0
90 2022-09-25 141.0
91 2022-10-02 140.0

92 rows × 2 columns

df_weekly_baugette["month"] = df_weekly_baugette.datetime.dt.month
df_weekly_baugette["WOY"] = df_weekly_baugette.datetime.dt.isocalendar().week
df_weekly_baugette["CWOY"] = df_weekly_baugette.apply(lambda x: x["WOY"] + 53 if x.datetime.year == 2022 else x["WOY"], axis=1)
zero_counts = (df_daily_baugette.Quantity == 0)
df_daily_baugette[zero_counts].shape

fp = "../data/weekly_baguette_sales.csv"
df_weekly_baugette.to_csv(fp, index=False)
df_daily_baugette = df_daily_baugette.reset_index()
df_daily_baugette["Day"] = df_daily_baugette.index
df_daily_baugette["DOW"] = df_daily_baugette.datetime.dt.day_of_week
df_daily_baugette["month"] = df_daily_baugette.datetime.dt.month
df_daily_baugette["WOY"] = df_daily_baugette.datetime.dt.isocalendar().week
cols = ["Day","datetime", "DOW", "month", "WOY", "Quantity"]
df_daily_baugette = df_daily_baugette[cols]
df_daily_baugette.shape
(637, 6)
from matplotlib import pyplot as plt
plt.figure(figsize=(12, 6))
df_daily_baugette["Quantity"].plot()

from statsmodels.tsa.seasonal import STL
daily_baugette_sales = pd.Series(df_daily_baugette["Quantity"].values, index=df_daily_baugette["datetime"])
stl = STL(daily_baugette_sales, period=7)
res = stl.fit()
decomp_res = {"Trend": res._trend, "Seasonality": res._seasonal, "Noise": res._resid}
df_res = pd.DataFrame.from_dict(decomp_res, orient="columns")
df_res = df_res.reset_index()
df_res
datetime Trend Seasonality Noise
0 2021-01-02 28.179642 11.289480 6.530878
1 2021-01-03 27.405646 10.434721 -2.840368
2 2021-01-04 26.674892 6.537131 -3.212023
3 2021-01-05 25.977024 2.894178 0.128798
4 2021-01-06 25.312682 -24.668598 -0.644084
... ... ... ... ...
632 2022-09-26 27.396448 -15.522747 6.126299
633 2022-09-27 28.848273 2.410161 2.741566
634 2022-09-28 30.336600 -8.081090 0.744491
635 2022-09-29 31.840976 6.798708 -8.639685
636 2022-09-30 33.351069 -3.845030 5.493961

637 rows × 4 columns

df_daily_baugette["Trend"] = df_res["Trend"].round(3)
df_daily_baugette["Seasonality"] = df_res["Seasonality"].round(3)
zero_counts = (df_daily_baugette.Quantity == 0)
df_daily_baugette[zero_counts].shape
(37, 8)
# Using plotly.express
import plotly.express as px
fig = px.line(df_daily_baugette, x='datetime', y="Quantity", markers=True)
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()
# Using plotly.express
import plotly.express as px
fig = px.line(df_res, x='datetime', y="Trend", title="Trend Cycle Component of Daily Baugette Sales",
             labels = {"Trend": "Number of Baugettes", "datetime": "date"})# Using plotly.express
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()
# Using plotly.express
import plotly.express as px
fig = px.line(df_res, x='datetime', y="Seasonality", title="Seasonality Component of Daily Baugette Sales",
             labels = {"Trend": "Number of Baugettes", "datetime": "date"})# Using plotly.express
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()
# Using plotly.express
import plotly.express as px
fig = px.line(df_res, x='datetime', y="Noise", title="Noise Component of Daily Baugette Sales",
             labels = {"Noise": "Number of Baugettes", "datetime": "date"})# Using plotly.express
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()
df_res["Trend"].plot.kde()
plt.grid(True)

df_res["Seasonality"].plot.kde()
plt.grid(True)
import statsmodels.api as sm
plt.rc("figure", figsize=(12,8))
acf_plot = sm.graphics.tsa.plot_acf(df_daily_baugette["Quantity"] , lags=40)
plt.grid(True)
plt.rc("figure", figsize=(12,8))
pacf_plot = sm.graphics.tsa.plot_pacf(df_daily_baugette["Quantity"], lags=40, method="ywm")
plt.grid(True)
df_baguette = df_baguette.reset_index(drop=True)
df_baguette = df_baguette.rename(columns={"index": "Day"})
df_baguette["Day"] = df_baguette.index + 1
df_baguette["DOW"] = df_baguette.datetime.dt.day_of_week
df_baguette["month"] = df_baguette.datetime.dt.month
df_daily_baugette["CWOY"] = df_daily_baugette.apply(lambda x: x["WOY"] + 53 if x.datetime.year == 2022 else x["WOY"], axis=1)
df_daily_baugette["CWOY"].max()
df_daily_baugette
df_period_perf  = pd.pivot_table(df_daily_baugette, index= "CWOY", columns= "DOW", values = "Quantity", fill_value=0) 
df_period_perf = df_period_perf.apply(lambda x: x.div(x.sum()), axis=1).round(3).reset_index()
df_period_perf = df_period_perf.melt(id_vars=["CWOY"], value_vars=[i for i in range(7)], value_name="prop")
df_period_perf
df_daily_baugette = df_daily_baugette.merge(df_period_perf, on = ["CWOY", "DOW"])
df_daily_baugette.columns
cols_for_modeling = df_daily_baugette.columns.tolist()
cols_for_modeling = [c for c in cols_for_modeling if c not in ["datetime", "WOY", "Trend", "Seasonality"]]
df_daily_baugette = df_daily_baugette[cols_for_modeling]
fp = "../data/daily_baugette_sales.csv"
df_daily_baugette.to_csv(fp, index=False)
# Using plotly.express
import plotly.express as px
fig = px.scatter(df_weekly_baugette, x='CWOY', y="Quantity")
fig.update_layout(
    autosize=False,
    width=1100,
    height=800,
)
fig.show()